{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import ByteLevelBPETokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = ByteLevelBPETokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "chinese-standard.jsonl.dedup\r\n",
      "filter-malay-rojak-en.jsonl.dedup\r\n",
      "filter-malay-rojak-ms.jsonl.dedup\r\n",
      "filter-malay-rojak-rojak.jsonl.dedup\r\n",
      "filter-twitter-id.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-id.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-ms.jsonl.dedup\r\n",
      "filter-twitter-malay-rojak-rojak.jsonl.dedup\r\n",
      "kaskus.jsonl.dedup\r\n",
      "local-mandarin.jsonl.dedup\r\n",
      "others.jsonl.dedup\r\n",
      "others-sample-50k.jsonl.dedup\r\n",
      "others-sample.jsonl.dedup\r\n",
      "prepare-english-en.jsonl.dedup\r\n",
      "prepare-english-ms.jsonl.dedup\r\n",
      "prepare-indon.jsonl.dedup\r\n",
      "prepare-indon-standard.jsonl.dedup\r\n",
      "prepare-malay-ms.jsonl.dedup\r\n",
      "prepare-manglish-en.jsonl.dedup\r\n",
      "prepare-manglish-manglish.jsonl.dedup\r\n",
      "prepare-standard-mandarin.jsonl.dedup\r\n",
      "standard-mandarin.jsonl.dedup\r\n"
     ]
    }
   ],
   "source": [
    "!ls *.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepare-malay-ms.jsonl.dedup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:02, 355505.11it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepare-manglish-manglish.jsonl.dedup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:02, 374691.15it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepare-english-ms.jsonl.dedup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:02, 371460.86it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "kaskus.jsonl.dedup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:02, 489280.27it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "local-mandarin.jsonl.dedup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:05, 176232.96it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "standard-mandarin.jsonl.dedup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:03, 323622.91it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "others-sample.jsonl.dedup\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "999999it [00:02, 438037.92it/s]\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "files = [\n",
    "    'prepare-malay-ms.jsonl.dedup',\n",
    "    'prepare-manglish-manglish.jsonl.dedup',\n",
    "    'prepare-english-ms.jsonl.dedup',\n",
    "    'kaskus.jsonl.dedup',\n",
    "    'local-mandarin.jsonl.dedup',\n",
    "    'standard-mandarin.jsonl.dedup',\n",
    "    'others-sample.jsonl.dedup'\n",
    "]\n",
    "\n",
    "with open('texts', 'w') as fopen_l:\n",
    "    for f in files:\n",
    "        print(f)\n",
    "        count = 0\n",
    "        with open(f) as fopen:\n",
    "            for l in tqdm(fopen):\n",
    "                data = json.loads(l)\n",
    "                fopen_l.write(f'{data}\\n')\n",
    "                count += 1\n",
    "                if count >= 1e6:\n",
    "                    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tokenizer.train(\n",
    "    ['texts'],\n",
    "    vocab_size=400000,\n",
    "    min_frequency=2,\n",
    "    show_progress=True,\n",
    "    special_tokens=[\"<s>\", \"<pad>\", \"</s>\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save('bpe-400k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "from transformers import PreTrainedTokenizerFast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = PreTrainedTokenizerFast(tokenizer_file='bpe-400k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "r = re.compile(r'[\\S]+').findall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.75 ms, sys: 0 ns, total: 2.75 ms\n",
      "Wall time: 2.47 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'tak Ġsuka Ġayam 1 2 3 1 2 4 1 2 3'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "strings = [\n",
    "    'tak suka ayam123124123',\n",
    "    'tak suka ayam'\n",
    "]\n",
    "subs = [' '.join(tokenizer.tokenize(s)) for s in strings]\n",
    "subs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocabulary = list(tokenizer.vocab.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "bow = CountVectorizer(vocabulary = vocabulary, token_pattern = r'[\\S]+').fit(subs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('bow.pkl', 'wb') as fopen:\n",
    "    pickle.dump(bow, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import ComplementNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "nb = ComplementNB().fit(bow.transform(subs), [0,1])\n",
    "nb = nb.partial_fit(bow.transform(subs), [0,1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ComplementNB()"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('nb.pkl', 'wb') as fopen:\n",
    "    pickle.dump(nb, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
